# Load the Portuguese subset (5 provenances x 4 blocks) and format it:
# design variables become factors, and age is rescaled for the models below.
data <- readRDS(file = "../data/sub_portugal_5provs_4blocks.rds") %>%
  mutate(across(all_of(c("block", "prov", "clon", "tree")), as.factor)) %>%
  # Age is defined on R+, so we only rescale it (no centering).
  mutate(age.sc = as.vector(scale(age, center = FALSE)))
  # mutate(age.sc = as.vector(scale(age))) # alternative: mean-centered age
The dataset includes:
# Cross-tabulate provenance by block number.
with(data, table(prov, block)) %>%
  kable(caption = "Provenance against block number.")
| 34 | 35 | 36 | 38 | |
|---|---|---|---|---|
| LEI | 78 | 71 | 72 | 82 |
| MIM | 44 | 45 | 60 | 60 |
| PIE | 26 | 29 | 34 | 34 |
| SAC | 21 | 20 | 18 | 21 |
| VAL | 37 | 43 | 42 | 42 |
# Cross-tabulate provenance by measurement age (months).
with(data, table(prov, as.factor(age))) %>%
  kable(caption = "Provenance against age (in months).")
| 11 | 15 | 20 | 27 | |
|---|---|---|---|---|
| LEI | 92 | 84 | 65 | 62 |
| MIM | 72 | 59 | 40 | 38 |
| PIE | 36 | 33 | 27 | 27 |
| SAC | 23 | 23 | 17 | 17 |
| VAL | 48 | 44 | 37 | 35 |
# Overall distribution of tree height.
ggplot(data) +
  aes(x = height) +
  geom_histogram(color = "darkblue", fill = "lightblue") +
  theme_bw()
Overall height distribution.
# Height distribution, one panel per measurement age.
ggplot(data, aes(x = height, color = as.factor(age))) +
  geom_histogram(alpha = 0.5, fill = "white", position = "identity") +
  facet_wrap(~as.factor(age)) +
  theme_bw() +
  theme(legend.position = "none")
Height versus age.
# Height vs age: linear smooth (left) and quadratic smooth (right), side by side.
base_height_plot <- ggplot(data, aes(x = age, y = height)) +
  geom_point(alpha = 0.2) +
  theme_bw() +
  theme(axis.title = element_text(size = 16))
plot_grid(
  base_height_plot + stat_smooth(method = "lm", col = "red"),
  base_height_plot + stat_smooth(method = "lm", col = "red", formula = y ~ poly(x, 2))
)
Height versus age.
# log(height) vs age: linear smooth (left) and quadratic smooth (right).
base_logheight_plot <- ggplot(data, aes(x = age, y = log(height))) +
  geom_point(alpha = 0.2) +
  theme_bw() +
  theme(axis.title = element_text(size = 16))
plot_grid(
  base_logheight_plot + stat_smooth(method = "lm", col = "red"),
  base_logheight_plot + stat_smooth(method = "lm", col = "red", formula = y ~ poly(x, 2))
)
Height versus age.
# Height distribution faceted by provenance (rows) and block (columns).
ggplot(data, aes(x = height, color = block)) +
  geom_histogram(alpha = 0.5, fill = "white", position = "identity") +
  facet_grid(prov ~ block, labeller = label_both) +
  theme_bw() +
  theme(legend.position = "none")
Height distribution by Provenance and Block.
Dummy variables for each level = Regularized intercepts, because we use weakly informative priors. But no information shared between intercepts. (See P299 in Statistical Rethinking of McElreath)
First, we are going to try three different likelihoods: the normal distribution, the normal distribution with a log-transformed response variable and a lognormal distribution.
\[\begin{equation} \begin{aligned} h_{i} & \sim \mathcal{N}(\mu_{i},\sigma_{i})\\ \mu_{i} & = \beta_{age}age_{i} + \beta_{age2}age^{2}_{i} + \alpha_{BLOCK[b]} + \alpha_{PROV[p]}\\ \alpha_{BLOCK} & \sim \mathcal{N}(0,10)\\ \alpha_{PROV} & \sim \mathcal{N}(0,10)\\ \beta_{age} & \sim \mathcal{N}(0,10) \\ \beta_{age2} & \sim \mathcal{N}(0,10)\\ \sigma & \sim \text{HalfCauchy}(0,25) \end{aligned} \end{equation}\]Mathematical model
Data in a list.
# Bundle the variables the Stan models expect into a named list.
data.list <- list(
  N      = length(data$height),         # number of observations
  y      = data$height,                 # response variable (raw height)
  age    = data$age.sc,                 # scaled tree age
  nprov  = length(unique(data$prov)),   # number of provenances
  nblock = length(unique(data$block)),  # number of blocks
  prov   = as.numeric(data$prov),       # provenance index per tree
  bloc   = as.numeric(data$block)       # block index per tree
)
# Compile the normal-likelihood model and sample: 2 chains x 2000 iterations
# (first half is warmup by default).
mN <- stan_model("mN.stan")
fit.mN <- sampling(mN, data = data.list, iter = 2000, chains = 2, cores = 2)
# Posterior medians, 95% intervals and convergence diagnostics (Rhat, ESS).
broom::tidyMCMC(fit.mN,
                pars = c("beta_age","beta_age2", "alpha_prov", "alpha_block", "sigma_y", "lp__"),
                droppars = NULL, estimate.method = "median",
                ess = TRUE, rhat = TRUE, conf.int = TRUE) %>%
  kable(caption = "Model mN with a normal distribution")
| term | estimate | std.error | conf.low | conf.high | rhat | ess |
|---|---|---|---|---|---|---|
| beta_age | 137.17866 | 7.765476 | 122.445202 | 152.70549 | 0.9991825 | 1959 |
| beta_age2 | 151.12115 | 6.269038 | 138.942113 | 163.08563 | 1.0003679 | 1886 |
| alpha_prov[1] | 45.92467 | 7.030658 | 31.954684 | 60.12776 | 1.0006641 | 2118 |
| alpha_prov[2] | 30.10218 | 7.519204 | 15.621609 | 44.92392 | 0.9990958 | 2367 |
| alpha_prov[3] | 12.28498 | 8.350689 | -4.054990 | 28.57003 | 0.9997337 | 2286 |
| alpha_prov[4] | 19.50809 | 8.736821 | 3.121270 | 36.17416 | 0.9992251 | 2435 |
| alpha_prov[5] | 25.32921 | 8.092449 | 9.272715 | 41.51663 | 1.0005580 | 1987 |
| alpha_block[1] | 21.70303 | 7.232428 | 7.651517 | 35.02408 | 0.9991084 | 2160 |
| alpha_block[2] | 26.80213 | 7.552256 | 12.028854 | 41.00510 | 0.9990695 | 2507 |
| alpha_block[3] | 35.52770 | 7.707398 | 19.910170 | 50.20886 | 1.0002847 | 2281 |
| alpha_block[4] | 49.63100 | 7.928611 | 34.064540 | 65.26942 | 0.9990736 | 2585 |
| sigma_y | 141.49786 | 3.616364 | 134.394562 | 148.62190 | 0.9997112 | 2777 |
| lp__ | -5048.53803 | 2.441098 | -5054.499083 | -5045.06559 | 1.0029609 | 798 |
Comparison of the distribution of \(y\) and the posterior predictive distributions (from the yrep matrix).
# Overlay the observed height density on 50 posterior predictive draws.
ppc_dens_overlay(y = data$height,
                 yrep = as.matrix(fit.mN, pars = "y_rep")[1:50, ]) +
  theme_bw() +
  theme(legend.text = element_text(size = 25),
        legend.title = element_text(size = 18),
        axis.text = element_text(size = 18),
        legend.position = c(0.8, 0.6))
“In the plot above, the dark line is the distribution of the observed outcomes \(y\) and each of the 50 lighter lines is the kernel density estimate of one of the replications of \(y\) from the posterior predictive distribution (i.e., one of the rows in yrep).” From Graphical posterior predictive checks using the bayesplot package.
This plot makes it easy to see that this model poorly fits the data. We can probably do better…
\[\begin{equation} \begin{aligned} \text{log}(h_{i}) & \sim \mathcal{N}(\mu_{i},\sigma_{i})\\ \mu_{i} & = \beta_{age}age_{i} + \beta_{age2}age^{2}_{i} + \alpha_{BLOCK[b]} + \alpha_{PROV[p]}\\ \alpha_{BLOCK} & \sim \mathcal{N}(0,10)\\ \alpha_{PROV} & \sim \mathcal{N}(0,10)\\ \beta_{age} & \sim \mathcal{N}(0,10) \\ \beta_{age2} & \sim \mathcal{N}(0,10)\\ \sigma & \sim \text{HalfCauchy}(0,25) \end{aligned} \end{equation}\]Mathematical model
# Same list as data.list, but with the response on the log scale.
data.list.mNlogy <- list(
  N      = length(data$height),         # number of observations
  y      = log(data$height),            # log(response variable)
  age    = data$age.sc,                 # scaled tree age
  nprov  = length(unique(data$prov)),   # number of provenances
  nblock = length(unique(data$block)),  # number of blocks
  prov   = as.numeric(data$prov),       # provenance index per tree
  bloc   = as.numeric(data$block)       # block index per tree
)
Same stan code as the previous model!
# Compile and fit the log-response model; max_treedepth is raised to 14
# (see the Stan warnings guide referenced below).
mNlogy <- stan_model("mNlogy.stan")
fit.mNlogy <- sampling(mNlogy, data = data.list.mNlogy, iter = 2000, chains = 2,
                       control = list(max_treedepth = 14), cores = 2,
                       save_warmup = FALSE)
# Posterior medians, 95% intervals and convergence diagnostics (Rhat, ESS).
broom::tidyMCMC(fit.mNlogy,
                pars = c("beta_age", "beta_age2", "alpha_prov", "alpha_block", "sigma_y", "lp__"),
                droppars = NULL, estimate.method = "median",
                ess = TRUE, rhat = TRUE, conf.int = TRUE) %>%
  kable(caption = "Model mNlogy with a normal distribution and a log-transformed response variable")
| term | estimate | std.error | conf.low | conf.high | rhat | ess |
|---|---|---|---|---|---|---|
| beta_age | 3.0590213 | 0.3540056 | 2.3886657 | 3.7368434 | 0.9993254 | 625 |
| beta_age2 | -0.8529809 | 0.1703165 | -1.1809340 | -0.5366599 | 0.9991601 | 598 |
| alpha_prov[1] | 1.4173513 | 3.3803650 | -5.3612399 | 8.1904353 | 1.0028008 | 234 |
| alpha_prov[2] | 1.3639220 | 3.3799601 | -5.4099302 | 8.1415919 | 1.0028224 | 234 |
| alpha_prov[3] | 1.2581216 | 3.3792558 | -5.4902359 | 8.0397761 | 1.0027840 | 235 |
| alpha_prov[4] | 1.3757216 | 3.3806710 | -5.3966052 | 8.1362501 | 1.0027976 | 234 |
| alpha_prov[5] | 1.3735124 | 3.3787015 | -5.3767142 | 8.1289650 | 1.0028222 | 234 |
| alpha_block[1] | 2.2812688 | 3.3770891 | -4.5192376 | 9.1881944 | 1.0027868 | 233 |
| alpha_block[2] | 2.3041504 | 3.3761641 | -4.4665490 | 9.2467735 | 1.0027215 | 233 |
| alpha_block[3] | 2.3324488 | 3.3767774 | -4.4769462 | 9.2174757 | 1.0027396 | 233 |
| alpha_block[4] | 2.4199726 | 3.3777959 | -4.3854048 | 9.3615308 | 1.0027444 | 233 |
| sigma_y | 0.3970389 | 0.0097395 | 0.3786201 | 0.4166466 | 1.0022870 | 734 |
| lp__ | 372.5172291 | 2.4389120 | 366.8161804 | 375.9237097 | 1.0024838 | 595 |
There may be a warning about effective sample size. This warning should disappear if we increase the number of iterations to 2,500.
We have to increase the maximum treedepth: See the Brief Guide to Stan’s Warnings
lp__ has largely increased.
Comparison of the distribution of \(y\) and the posterior predictive distributions (from the yrep matrix).
# Extract the posterior predictive draws once and reuse them
# (the original extracted y_rep but then recomputed the matrix inline).
y_rep <- as.matrix(fit.mNlogy, pars = "y_rep")
# Overlay the observed log-height density on 50 posterior predictive draws.
ppc_dens_overlay(y = log(data$height),
                 yrep = y_rep[1:50, ]) +
  theme_bw() +
  theme(legend.text = element_text(size = 25), legend.title = element_text(size = 18),
        axis.text = element_text(size = 18), legend.position = c(0.8, 0.6))
A better fit than mN.
\[\begin{equation} \begin{aligned} h_{i} & \sim \text{LogNormal}(\mu_{i},\sigma_{i})\\ \mu_{i} & = \beta_{age}age_{i} + \beta_{age2}age^{2}_{i} + \alpha_{BLOCK[b]} + \alpha_{PROV[p]}\\ \alpha_{BLOCK} & \sim \mathcal{N}(0,10)\\ \alpha_{PROV} & \sim \mathcal{N}(0,10)\\ \beta_{age} & \sim \mathcal{N}(0,10) \\ \beta_{age2} & \sim \mathcal{N}(0,10)\\ \sigma & \sim \text{HalfCauchy}(0,25) \end{aligned} \end{equation}\]Mathematical model
We are going to use the same data list as in the first model mN.
# Compile and fit the lognormal-likelihood model on the raw response.
mLogNR <- stan_model("mLogNR.stan")
fit.mLogNR <- sampling(mLogNR, data = data.list, iter = 2000,
                       chains = 2, control = list(max_treedepth = 14),
                       save_warmup = FALSE)
# Posterior medians, 95% intervals and convergence diagnostics (Rhat, ESS).
broom::tidyMCMC(fit.mLogNR,
                pars = c("beta_age", "beta_age2", "alpha_prov", "alpha_block", "sigma_y", "lp__"),
                droppars = NULL, estimate.method = "median",
                ess = TRUE, rhat = TRUE, conf.int = TRUE) %>%
  kable(caption = "Model mLogNR with a LogNormal distribution on R")
| term | estimate | std.error | conf.low | conf.high | rhat | ess |
|---|---|---|---|---|---|---|
| beta_age | 3.0307614 | 0.3328479 | 2.3505536 | 3.7293303 | 1.0048069 | 533 |
| beta_age2 | -0.8434647 | 0.1591522 | -1.1791151 | -0.5168304 | 1.0046194 | 538 |
| alpha_prov[1] | 1.1458945 | 3.3561201 | -4.7535557 | 7.9004600 | 1.0105964 | 267 |
| alpha_prov[2] | 1.1003799 | 3.3568042 | -4.7527428 | 7.8778128 | 1.0106404 | 266 |
| alpha_prov[3] | 0.9924494 | 3.3573113 | -4.8992699 | 7.7765604 | 1.0105778 | 266 |
| alpha_prov[4] | 1.1016616 | 3.3574769 | -4.7598206 | 7.8870408 | 1.0105980 | 267 |
| alpha_prov[5] | 1.1038531 | 3.3570309 | -4.7782228 | 7.8661618 | 1.0106182 | 266 |
| alpha_block[1] | 2.5720399 | 3.3659581 | -4.2569123 | 8.3785850 | 1.0110725 | 265 |
| alpha_block[2] | 2.6195725 | 3.3662227 | -4.2951219 | 8.4114883 | 1.0110646 | 265 |
| alpha_block[3] | 2.6304366 | 3.3661878 | -4.2271154 | 8.4196408 | 1.0111538 | 265 |
| alpha_block[4] | 2.7234038 | 3.3667849 | -4.1140769 | 8.5167146 | 1.0111019 | 265 |
| sigma_y | 0.3966131 | 0.0094509 | 0.3789919 | 0.4163500 | 0.9999027 | 819 |
| lp__ | 372.6195200 | 2.4430696 | 366.8167358 | 376.1387599 | 1.0080040 | 611 |
Comparison of the distribution of \(y\) and the posterior predictive distributions (from the yrep matrix).
# Posterior predictive check on the raw-height scale for the lognormal model.
ppc_dens_overlay(y = data$height,
                 yrep = as.matrix(fit.mLogNR, pars = "y_rep")[1:50, ]) +
  theme_bw() +
  theme(legend.text = element_text(size = 25),
        legend.title = element_text(size = 18),
        axis.text = element_text(size = 18),
        legend.position = c(0.8, 0.6))
In the previous models, our priors are very weakly informative. We can use a little more informative priors, which can help convergence and decrease the running time. Below, we refit the model mNlogy (normal distribution with log(y)) with more informative priors.
Variance priors
Prior recommendations for scale parameters in hierarchical models
Very weakly informative prior: \(\sigma \sim \text{HalfCauchy}(0,25)\). From Gelman (2006): 8-schools example (p430). And here.
Weakly informative prior:
\(\sigma \sim \text{HalfCauchy}(0,1)\) (McElreath, First version) \(\sigma \sim \text{HalfCauchy}(0,5)\) (Betancourt in 8-schools example)
\(\sigma \sim \text{exponential}(1)\) (McElreath, Second version) or \(\sigma \sim \text{exponential}(0.1)\)
\(\sigma \sim \text{LogNormal}(0,1)\) (McElreath, Second version)
More informative prior : \(\sigma \sim \text{HalfNormal}(0,1)\) or \(\sigma \sim \text{Half-t}(3,0,1)\)
Beta priors
More informative priors: \(\beta \sim \mathcal{N}(0,1)\).
We can also consider that \(age\) is positively associated with height so we can add some information in our model and constrain \(\beta_{age}\) to positive values, such as: \(\beta_{age} \sim \text{LogNormal}(0,1)\) or \(\beta_{age} \sim \text{Gamma}(?,?)\).
\[\begin{equation} \begin{aligned} \text{log}(h_{i}) & \sim \mathcal{N}(\mu_{i},\sigma_{i})\\ \mu_{i} & = \beta_{age}age_{i} + \beta_{age2}age^{2}_{i} + \alpha_{BLOCK[b]} + \alpha_{PROV[p]}\\ \alpha_{BLOCK} & \sim \mathcal{N}(0,10)\\ \alpha_{PROV} & \sim \mathcal{N}(0,10)\\ \beta_{age} & \sim \mathcal{N}(0,10) \\ \beta_{age2} & \sim \mathcal{N}(0,10)\\ \sigma & \sim \text{Exponential}(1) \end{aligned} \end{equation}\]Mathematical model
# Refit the log-response model with a more informative sigma prior.
mNlogySigmaPrior <- stan_model("mNlogySigmaPrior.stan")
fit.mNlogySigmaPrior <- sampling(mNlogySigmaPrior, data = data.list.mNlogy,
                                 iter = 2000, chains = 2, cores = 2,
                                 control = list(max_treedepth = 14),
                                 save_warmup = FALSE)
## Warning: Tail Effective Samples Size (ESS) is too low, indicating posterior variances and tail quantiles may be unreliable.
## Running the chains for more iterations may help. See
## http://mc-stan.org/misc/warnings.html#tail-ess
# Posterior medians, 95% intervals and convergence diagnostics (Rhat, ESS).
broom::tidyMCMC(fit.mNlogySigmaPrior,
                pars = c("beta_age", "beta_age2", "alpha_prov", "alpha_block", "sigma_y", "lp__"),
                droppars = NULL, estimate.method = "median",
                ess = TRUE, rhat = TRUE, conf.int = TRUE) %>%
  kable(caption = "Model mNlogySigmaPrior with a normal distribution and a log-transformed response variable")
| term | estimate | std.error | conf.low | conf.high | rhat | ess |
|---|---|---|---|---|---|---|
| beta_age | 3.0853939 | 0.3115853 | 2.5005104 | 3.6681742 | 1.002025 | 416 |
| beta_age2 | -0.8681602 | 0.1489561 | -1.1516606 | -0.5807643 | 1.001894 | 415 |
| alpha_prov[1] | 1.7237782 | 3.3001236 | -5.1851552 | 8.1666440 | 1.000452 | 278 |
| alpha_prov[2] | 1.6744454 | 3.3009885 | -5.2144130 | 8.0998992 | 1.000447 | 278 |
| alpha_prov[3] | 1.5646613 | 3.2997657 | -5.3127174 | 7.9979732 | 1.000442 | 278 |
| alpha_prov[4] | 1.7134032 | 3.2999423 | -5.1963587 | 8.1239492 | 1.000428 | 278 |
| alpha_prov[5] | 1.6902387 | 3.2998158 | -5.2190709 | 8.1101379 | 1.000448 | 278 |
| alpha_block[1] | 1.9336633 | 3.3087644 | -4.3830387 | 8.9411582 | 1.000624 | 279 |
| alpha_block[2] | 1.9742946 | 3.3093718 | -4.3784910 | 8.9704145 | 1.000629 | 279 |
| alpha_block[3] | 1.9966810 | 3.3086366 | -4.3427997 | 8.9796271 | 1.000621 | 278 |
| alpha_block[4] | 2.0963133 | 3.3092574 | -4.2326275 | 9.1269228 | 1.000629 | 278 |
| sigma_y | 0.3961707 | 0.0091660 | 0.3794562 | 0.4156804 | 1.002524 | 554 |
| lp__ | 372.4438071 | 2.3761946 | 366.6220138 | 375.7898736 | 1.000649 | 664 |
Comparison of the distribution of \(y\) and the posterior predictive distributions (from the yrep matrix).
# Posterior predictive check on the log-height scale.
ppc_dens_overlay(y = log(data$height),
                 yrep = as.matrix(fit.mNlogySigmaPrior, pars = "y_rep")[1:50, ]) +
  theme_bw() +
  theme(legend.text = element_text(size = 25),
        legend.title = element_text(size = 18),
        axis.text = element_text(size = 18),
        legend.position = c(0.8, 0.6))
\[\begin{equation} \begin{aligned} \text{log}(h_{i}) & \sim \mathcal{N}(\mu_{i},\sigma_{i})\\ \mu_{i} & = \beta_{age}age_{i} + \beta_{age2}age^{2}_{i} + \alpha_{BLOCK[b]} + \alpha_{PROV[p]}\\ \alpha_{BLOCK} & \sim \mathcal{N}(0,1)\\ \alpha_{PROV} & \sim \mathcal{N}(0,1)\\ \beta_{age} & \sim \mathcal{N}(0,1) \\ \beta_{age2} & \sim \mathcal{N}(0,1)\\ \sigma & \sim \text{Exponential}(1) \end{aligned} \end{equation}\]Mathematical model
# Refit with N(0,1) priors on the slopes and intercepts as well.
mNlogyBetaAlphaPrior <- stan_model("mNlogyBetaAlphaPrior.stan")
fit.mNlogyBetaAlphaPrior <- sampling(mNlogyBetaAlphaPrior, data = data.list.mNlogy,
                                     iter = 2000, chains = 2, cores = 2,
                                     control = list(max_treedepth = 14),
                                     save_warmup = FALSE)
# Posterior medians, 95% intervals and convergence diagnostics (Rhat, ESS).
broom::tidyMCMC(fit.mNlogyBetaAlphaPrior,
                pars = c("beta_age", "beta_age2", "alpha_prov", "alpha_block", "sigma_y", "lp__"),
                droppars = NULL, estimate.method = "median",
                ess = TRUE, rhat = TRUE, conf.int = TRUE) %>%
  kable(caption = "Model mNlogyBetaAlphaPrior with a normal distribution and a log-transformed response variable")
| term | estimate | std.error | conf.low | conf.high | rhat | ess |
|---|---|---|---|---|---|---|
| beta_age | 3.1004531 | 0.2912053 | 2.4961041 | 3.6821798 | 1.002079 | 451 |
| beta_age2 | -0.8746115 | 0.1396886 | -1.1381178 | -0.5838175 | 1.002189 | 460 |
| alpha_prov[1] | 1.7130004 | 0.3515719 | 1.0019500 | 2.3930041 | 1.006129 | 217 |
| alpha_prov[2] | 1.6523240 | 0.3509656 | 0.9518173 | 2.3356440 | 1.006158 | 213 |
| alpha_prov[3] | 1.5548197 | 0.3501575 | 0.8462078 | 2.2350552 | 1.005868 | 214 |
| alpha_prov[4] | 1.6611466 | 0.3527107 | 0.9585417 | 2.3443441 | 1.006805 | 210 |
| alpha_prov[5] | 1.6611592 | 0.3514362 | 0.9584232 | 2.3594503 | 1.006014 | 216 |
| alpha_block[1] | 1.9893679 | 0.3521466 | 1.2864968 | 2.6990990 | 1.005293 | 206 |
| alpha_block[2] | 2.0245532 | 0.3527592 | 1.3196379 | 2.7259085 | 1.004878 | 206 |
| alpha_block[3] | 2.0361805 | 0.3519823 | 1.3313723 | 2.7511280 | 1.005202 | 204 |
| alpha_block[4] | 2.1429670 | 0.3528397 | 1.4271725 | 2.8361540 | 1.005711 | 211 |
| sigma_y | 0.3961433 | 0.0095731 | 0.3786008 | 0.4148985 | 1.000449 | 956 |
| lp__ | 351.9985186 | 2.5645317 | 345.5261730 | 355.5557700 | 1.000178 | 584 |
Comparison of the distribution of \(y\) and the posterior predictive distributions (from the yrep matrix).
# Posterior predictive check on the log-height scale.
ppc_dens_overlay(y = log(data$height),
                 yrep = as.matrix(fit.mNlogyBetaAlphaPrior, pars = "y_rep")[1:50, ]) +
  theme_bw() +
  theme(legend.text = element_text(size = 25),
        legend.title = element_text(size = 18),
        axis.text = element_text(size = 18),
        legend.position = c(0.8, 0.6))
Comment: The intercept parameters change between the two models (mNlogyBetaAlphaPrior and mNlogySigmaPrior).
We use again the model mNlogy (normal distribution with log(y) and very weakly informative priors)
\[\begin{equation} \begin{aligned} \text{log}(h_{i}) & \sim \mathcal{N}(\mu_{i},\sigma_{i})\\ \mu_{i} & = \beta_{age}age_{i} + \beta_{age2}age^{2}_{i} + \alpha_{BLOCK[b]} + \alpha_{PROV[p]}\\ \alpha_{BLOCK} & \sim \mathcal{N}(0,10)\\ \alpha_{PROV} & \sim \mathcal{N}(0,10)\\ \beta_{age} & \sim \mathcal{N}(0,10) \\ \beta_{age2} & \sim \mathcal{N}(0,10)\\ \sigma & \sim \text{HalfCauchy}(0,25) \end{aligned} \end{equation}\]Mathematical model
# First vectorized version of the log-response model.
mNlogyVectorize1 <- stan_model("mNlogyVectorize1.stan")
## recompiling to avoid crashing R session
fit.mNlogyVectorize1 <- sampling(mNlogyVectorize1, data = data.list.mNlogy,
                                 iter = 2000,
                                 control = list(max_treedepth = 14),
                                 chains = 2, cores = 2, save_warmup = FALSE)
# Posterior medians, 95% intervals and convergence diagnostics (Rhat, ESS).
broom::tidyMCMC(fit.mNlogyVectorize1,
                pars = c("beta_age", "alpha_prov", "alpha_block", "sigma_y", "lp__"),
                droppars = NULL, estimate.method = "median",
                ess = TRUE, rhat = TRUE, conf.int = TRUE) %>%
  kable(caption = "Model mNlogyVectorize1 with a normal distribution and log(y)")
| term | estimate | std.error | conf.low | conf.high | rhat | ess |
|---|---|---|---|---|---|---|
| beta_age | 3.0600624 | 0.3334058 | 2.3789984 | 3.7389229 | 1.0007327 | 403 |
| alpha_prov[1] | 1.5700576 | 3.3000869 | -4.6634773 | 8.1686080 | 1.0024234 | 249 |
| alpha_prov[2] | 1.4982032 | 3.2998967 | -4.7048100 | 8.1006768 | 1.0024422 | 249 |
| alpha_prov[3] | 1.4208945 | 3.3009499 | -4.8134431 | 8.0235527 | 1.0024733 | 249 |
| alpha_prov[4] | 1.5111111 | 3.3002768 | -4.7393854 | 8.1096778 | 1.0024572 | 249 |
| alpha_prov[5] | 1.5234478 | 3.3000213 | -4.6795873 | 8.1328925 | 1.0024566 | 249 |
| alpha_block[1] | 2.1617757 | 3.3119904 | -4.3750590 | 8.3947399 | 1.0022910 | 251 |
| alpha_block[2] | 2.1939238 | 3.3122452 | -4.3656572 | 8.4215537 | 1.0023326 | 251 |
| alpha_block[3] | 2.2109547 | 3.3125969 | -4.3607901 | 8.4132818 | 1.0023518 | 251 |
| alpha_block[4] | 2.3044364 | 3.3119205 | -4.2591138 | 8.5367959 | 1.0023170 | 251 |
| sigma_y | 0.3967513 | 0.0095748 | 0.3792346 | 0.4161848 | 1.0052156 | 464 |
| lp__ | 372.5540892 | 2.4441470 | 366.7206065 | 376.1479273 | 0.9996837 | 524 |
# max_treedepth raised here as well!
# Second vectorized version of the log-response model.
mNlogyVectorize2 <- stan_model("mNlogyVectorize2.stan")
## recompiling to avoid crashing R session
fit.mNlogyVectorize2 <- sampling(mNlogyVectorize2, data = data.list.mNlogy,
                                 iter = 2000,
                                 control = list(max_treedepth = 14),
                                 chains = 2, cores = 2, save_warmup = FALSE)
## Warning: Tail Effective Samples Size (ESS) is too low, indicating posterior variances and tail quantiles may be unreliable.
## Running the chains for more iterations may help. See
## http://mc-stan.org/misc/warnings.html#tail-ess
# Posterior medians, 95% intervals and convergence diagnostics (Rhat, ESS).
broom::tidyMCMC(fit.mNlogyVectorize2,
                pars = c("beta_age", "alpha_prov", "alpha_block", "sigma_y", "lp__"),
                droppars = NULL, estimate.method = "median",
                ess = TRUE, rhat = TRUE, conf.int = TRUE) %>%
  kable(caption = "Model mNlogyVectorize2 with a normal distribution and log(y)")
| term | estimate | std.error | conf.low | conf.high | rhat | ess |
|---|---|---|---|---|---|---|
| beta_age | 0.4079271 | 0.0145606 | 0.3819086 | 0.4384895 | 1.0017849 | 1054 |
| alpha_prov[1] | 2.7583218 | 3.2298185 | -3.6545509 | 9.6148244 | 1.0177140 | 179 |
| alpha_prov[2] | 2.7035882 | 3.2309829 | -3.7566788 | 9.6054385 | 1.0177927 | 178 |
| alpha_prov[3] | 2.6038981 | 3.2317715 | -3.8451136 | 9.5291288 | 1.0178303 | 178 |
| alpha_prov[4] | 2.7201621 | 3.2294372 | -3.6954186 | 9.5867217 | 1.0176730 | 178 |
| alpha_prov[5] | 2.7280783 | 3.2315433 | -3.6836257 | 9.5635812 | 1.0177782 | 178 |
| alpha_block[1] | 2.2278763 | 3.2309493 | -4.6737940 | 8.6385089 | 1.0177150 | 179 |
| alpha_block[2] | 2.2390444 | 3.2315742 | -4.6434048 | 8.6831788 | 1.0177958 | 179 |
| alpha_block[3] | 2.2673620 | 3.2303644 | -4.6117778 | 8.6788860 | 1.0177729 | 178 |
| alpha_block[4] | 2.3610523 | 3.2312457 | -4.5000585 | 8.7731852 | 1.0176530 | 179 |
| sigma_y | 0.4097010 | 0.0105303 | 0.3903519 | 0.4312985 | 1.0007078 | 999 |
| lp__ | -505.4200715 | 2.4391393 | -511.3613291 | -501.9341373 | 0.9996881 | 553 |
Let’s compare the speed of the 5 models with a normal distribution with log(y)
# Compare per-chain elapsed times (warmup + sampling) across the five
# log(y) models; fastest first.
log_y_fits <- list(fit.mNlogy, fit.mNlogySigmaPrior, fit.mNlogyBetaAlphaPrior,
                   fit.mNlogyVectorize1, fit.mNlogyVectorize2)
log_y_fits %>%
  lapply(function(fit) data.frame(get_elapsed_time(fit))) %>%
  bind_rows(.id = "model") %>%
  mutate(model = recode_factor(model,
                               "1" = "Model mNlogy with $\\sigma \\sim \\text{HalfCauchy}(0,25)$",
                               "2" = "Model mNlogySigmaPrior with $\\sigma \\sim \\text{Exponential}(1)$",
                               "3" = "Model mNlogyBetaAlphaPrior with $\\beta$ and $\\alpha \\sim \\mathcal{N}(0,1)$ ",
                               "4" = "Model mNlogyVectorize1",
                               "5" = "Model mNlogyVectorize2")) %>%
  mutate(total = warmup + sample) %>%
  arrange(total) %>%
  kable(caption = "Model speed comparison of models with a normal distribution and a log-transformed response variable.")
| model | warmup | sample | total |
|---|---|---|---|
| Model mNlogyBetaAlphaPrior with \(\beta\) and \(\alpha \sim \mathcal{N}(0,1)\) | 18.3864 | 22.5171 | 40.9035 |
| Model mNlogyVectorize1 | 20.4983 | 20.7858 | 41.2841 |
| Model mNlogyBetaAlphaPrior with \(\beta\) and \(\alpha \sim \mathcal{N}(0,1)\) | 19.2170 | 23.9824 | 43.1994 |
| Model mNlogyVectorize1 | 19.9800 | 23.3768 | 43.3568 |
| Model mNlogyVectorize2 | 21.5884 | 23.1175 | 44.7059 |
| Model mNlogyVectorize2 | 24.0733 | 26.2528 | 50.3261 |
| Model mNlogy with \(\sigma \sim \text{HalfCauchy}(0,25)\) | 91.4763 | 100.6600 | 192.1363 |
| Model mNlogySigmaPrior with \(\sigma \sim \text{Exponential}(1)\) | 91.1304 | 111.7290 | 202.8594 |
| Model mNlogy with \(\sigma \sim \text{HalfCauchy}(0,25)\) | 84.9207 | 119.5970 | 204.5177 |
| Model mNlogySigmaPrior with \(\sigma \sim \text{Exponential}(1)\) | 97.5780 | 107.6160 | 205.1940 |
For the subsequent models, we will use the more informative priors of mNlogyBetaAlphaPrior.
Adaptive regularization
Links:
We are going to
P357 McElreath (first version).
mmNlogyC (multi-level model with normal distribution and log(y) - centered parameterization)
\[\begin{equation} \begin{aligned} \text{log}(h_{i}) & \sim \mathcal{N}(\mu_{i},\sigma_{i})\\ \mu_{i} & = \beta_{age}age_{i} + \beta_{age2}age^{2}_{i} + \alpha_{PROV[p]}\\ \alpha_{PROV} & \sim \mathcal{N}(\mu_{\alpha_{PROV}},\sigma_{\alpha_{PROV}})\\ \mu_{\alpha_{PROV}} & \sim \mathcal{N}(0,1)\\ \sigma_{\alpha_{PROV}}& \sim \text{Exponential}(1)\\ \beta_{age} & \sim \mathcal{N}(0,1) \\ \beta_{age2} & \sim \mathcal{N}(0,1)\\ \sigma & \sim \text{Exponential}(1) \end{aligned} \end{equation}\]Mathematical model
# Reduced list for the provenance-only multilevel models (log response).
data.list.reduced <- list(
  N     = length(data$height),        # number of observations
  y     = log(data$height),           # log(response variable)
  age   = data$age.sc,                # scaled tree age
  nprov = length(unique(data$prov)),  # number of provenances
  prov  = as.numeric(data$prov)       # provenance index per tree
)
# Compile and fit the centered-parameterization multilevel model.
mmNlogyC <- stan_model("mmNlogyC.stan")
fit.mmNlogyC <- sampling(mmNlogyC, data = data.list.reduced, iter = 2000,
                         chains = 2, cores = 2,
                         control = list(max_treedepth = 14),
                         save_warmup = FALSE)
## Warning: There were 2 divergent transitions after warmup. Increasing adapt_delta above 0.8 may help. See
## http://mc-stan.org/misc/warnings.html#divergent-transitions-after-warmup
## Warning: Examine the pairs() plot to diagnose sampling problems
# Posterior medians, 95% intervals and convergence diagnostics (Rhat, ESS).
broom::tidyMCMC(fit.mmNlogyC,
                pars = c("beta_age", "beta_age2", "mean_alpha_prov", "sigma_alpha_prov",
                         "sigma_y", "lp__"),
                droppars = NULL, estimate.method = "median",
                ess = TRUE, rhat = TRUE, conf.int = TRUE) %>%
  kable(caption = "One-varying intercept model mmNlogyC with centered parameterization")
| term | estimate | std.error | conf.low | conf.high | rhat | ess |
|---|---|---|---|---|---|---|
| beta_age | 2.8828267 | 0.3336757 | 2.2554695 | 3.5934452 | 1.009319 | 215 |
| beta_age2 | -0.7649118 | 0.1590296 | -1.1015075 | -0.4599303 | 1.008718 | 225 |
| mean_alpha_prov | 3.7947163 | 0.1667593 | 3.4459703 | 4.1223929 | 1.009050 | 245 |
| sigma_alpha_prov | 0.0589324 | 0.0520721 | 0.0169773 | 0.2208421 | 1.013428 | 336 |
| sigma_y | 0.4003032 | 0.0095925 | 0.3816535 | 0.4189197 | 1.001460 | 882 |
| lp__ | 362.8139984 | 2.5665427 | 356.7280601 | 366.4053299 | 1.000720 | 420 |
This model has some divergent transitions, low number of effective sample size and R-hat not exactly equal to one.
Divergent transitions
adapt_delta. McElreath (Second version): "[…] the target acceptance rate is controlled by the adapt_delta control parameter. The default is 0.95, which means that it aims to attain a 95% acceptance rate. It tries this during the warmup phase, adjusting the step size of each leapfrog step (go back to Chapter 9 if these terms aren’t familiar). When adapt_delta is set high, it results in a smaller step size, which means a more accurate approximation of the curved surface. It also means more computation, which means a slower chain. Increasing adapt_delta can often, but not always, help with divergent transitions."
You can also try to add some information in your model, for instance with \(\beta_{age} \sim \text{LogNormal}(0,1)\).
Or you can reparameterize your model with the non-centered parameterization. (Best option!!)
# Posterior predictive check on the log-height scale.
ppc_dens_overlay(y = log(data$height),
                 yrep = as.matrix(fit.mmNlogyC, pars = "y_rep")[1:50, ]) +
  theme_bw() +
  theme(legend.text = element_text(size = 25),
        legend.title = element_text(size = 18),
        axis.text = element_text(size = 18),
        legend.position = c(0.8, 0.6))
# Trace plots of one varying intercept and its scale, divergences marked via np.
mcmc_trace(as.array(fit.mmNlogyC),
           pars = c("alpha_prov[3]", "sigma_alpha_prov"),
           np = nuts_params(fit.mmNlogyC)) +
  xlab("Post-warmup iteration")
# Pairs plot to locate the divergent transitions (highlighted points).
mcmc_pairs(as.array(fit.mmNlogyC),
           np = nuts_params(fit.mmNlogyC),
           pars = c("beta_age", "beta_age2", "mean_alpha_prov",
                    "sigma_alpha_prov", "alpha_prov[3]", "sigma_y"),
           off_diag_args = list(size = 1, alpha = 1 / 3),
           np_style = pairs_style_np(div_size = 2, div_shape = 19))
Links:
From McElreath, P429 (13.4.2.) of Statistical Rethinking (second version)
\[ \alpha \sim \mathcal{N}(\mu,\sigma)\]
is equivalent to
\[\begin{equation} \begin{aligned} \alpha &= \mu + \beta\\ \beta &\sim \mathcal{N}(0,\sigma) \end{aligned} \end{equation}\]is equivalent to
\[\begin{equation} \begin{aligned} \alpha &= \mu + z\sigma\\ z &\sim \mathcal{N}(0,1) \end{aligned} \end{equation}\]No parameters are left inside the prior.
From Updating: A Set of Bayesian Notes. Jeffrey B. Arnold. 20 Multilevel Models
These are two ways of writing the same model. However, they change the parameters that the HMC algorithm is actively sampling and thus can have different sampling performance.
However, neither is universally better.
And there is currently no ex-ante way to know which will work better, or at what amount of "data" the performance of one or the other becomes better. However, one other reason to use the centered parameterization (if it is also scaled) is that the Stan HMC implementation tends to be more efficient if all parameters are on a unit scale.
mmNlogyNC (multi-level model with normal distribution and log(y) - non-centered parameterization)
\[\begin{equation} \begin{aligned} \text{log}(h_{i}) & \sim \mathcal{N}(\mu_{i},\sigma_{i})\\ \mu_{i} & = \alpha + \beta_{age}age_{i} + \beta_{age2}age^{2}_{i} + z_{PROV[p]}\sigma_{PROV}\\ \alpha & \sim \mathcal{N}(0,1)\\ z_{PROV} & \sim \mathcal{N}(0,1)\\ \sigma_{PROV}& \sim \text{Exponential}(1)\\ \beta_{age} & \sim \mathcal{N}(0,1) \\ \beta_{age2} & \sim \mathcal{N}(0,1)\\ \sigma & \sim \text{Exponential}(1) \end{aligned} \end{equation}\]Mathematical model
# Compile and fit the non-centered parameterization of the same model.
mmNlogyNC <- stan_model("mmNlogyNC.stan")
fit.mmNlogyNC <- sampling(mmNlogyNC, data = data.list.reduced, iter = 2000,
                          chains = 2, cores = 2,
                          control = list(max_treedepth = 14),
                          save_warmup = FALSE)
# Posterior medians, 95% intervals and convergence diagnostics (Rhat, ESS).
broom::tidyMCMC(fit.mmNlogyNC,
                pars = c("beta_age", "beta_age2", "z_prov", "sigma_prov", "alpha",
                         "sigma_y", "lp__"),
                droppars = NULL, estimate.method = "median",
                ess = TRUE, rhat = TRUE, conf.int = TRUE) %>%
  kable(caption = "One-varying intercept model mmNlogyNC with non-centered parameterization")
| term | estimate | std.error | conf.low | conf.high | rhat | ess |
|---|---|---|---|---|---|---|
| beta_age | 2.9149973 | 0.2957180 | 2.3534367 | 3.5210070 | 1.0001809 | 702 |
| beta_age2 | -0.7827389 | 0.1418270 | -1.0677224 | -0.5123850 | 1.0001225 | 723 |
| z_prov[1] | 0.8092344 | 0.6503304 | -0.3516886 | 2.1785367 | 0.9997079 | 972 |
| z_prov[2] | 0.1130628 | 0.6218723 | -1.1537892 | 1.2977990 | 0.9992637 | 980 |
| z_prov[3] | -1.0182242 | 0.7372131 | -2.5842314 | 0.2909131 | 1.0001025 | 855 |
| z_prov[4] | 0.1585837 | 0.7259417 | -1.2876329 | 1.5393172 | 0.9991521 | 1350 |
| z_prov[5] | 0.1471047 | 0.6331136 | -1.0652745 | 1.4573545 | 0.9996255 | 1269 |
| sigma_prov | 0.0582236 | 0.0545279 | 0.0114027 | 0.1964875 | 1.0035889 | 491 |
| alpha | 3.7805073 | 0.1447384 | 3.4869954 | 4.0582316 | 0.9997202 | 725 |
| sigma_y | 0.3994355 | 0.0096618 | 0.3820791 | 0.4184778 | 1.0003311 | 1688 |
| lp__ | 348.6348255 | 3.0093531 | 341.4264827 | 353.0039259 | 1.0006348 | 392 |
No more divergent transitions! But still low effective sample size.
# Posterior predictive check on the log-height scale.
ppc_dens_overlay(y = log(data$height),
                 yrep = as.matrix(fit.mmNlogyNC, pars = "y_rep")[1:50, ]) +
  theme_bw() +
  theme(legend.text = element_text(size = 25),
        legend.title = element_text(size = 18),
        axis.text = element_text(size = 18),
        legend.position = c(0.8, 0.6))
# Trace plots of a standardized intercept and the group scale.
mcmc_trace(as.array(fit.mmNlogyNC),
           pars = c("z_prov[3]", "sigma_prov"),
           np = nuts_params(fit.mmNlogyNC)) +
  xlab("Post-warmup iteration")
## No divergences to plot.
# Pairs plot for the non-centered fit (no divergences expected here).
mcmc_pairs(as.array(fit.mmNlogyNC),
           np = nuts_params(fit.mmNlogyNC),
           pars = c("beta_age", "beta_age2", "alpha",
                    "sigma_prov", "z_prov[3]", "sigma_y"),
           off_diag_args = list(size = 1, alpha = 1 / 3),
           np_style = pairs_style_np(div_size = 3, div_shape = 19))
For a lognormal distribution the non-centered parameterisation is different:
\[\alpha \sim \text{LogNormal}(\log(\mu),\sigma)\]
is equivalent to
\[\begin{equation} \begin{aligned} \alpha &= e^{\log(\mu) + z\sigma} \\ z &\sim \mathcal{N}(0,1) \end{aligned} \end{equation}\]mmLogNnc (multi-level LogNormal distribution - non-centered parameterization)
\[\begin{equation} \begin{aligned} h_{i} & \sim \text{LogNormal}(log(\mu_{i}),\sigma_{i})\\ \mu_{i} & = e^{log(\alpha) + z_{PROV[p]}\sigma_{PROV}} + \beta_{age}age_{i} + \beta_{age2}age^{2}_{i} \\ \alpha & \sim \text{LogNormal}(0,1) \\ \beta_{age} & \sim \mathcal{N}(0,1) \\ \beta_{age2} & \sim \mathcal{N}(0,1)\\ z_{PROV} & \sim \mathcal{N}(0,1)\\ \sigma_{PROV} & \sim \text{Exponential}(1)\\ \sigma & \sim \text{Exponential}(1) \end{aligned} \end{equation}\]Mathematical model
# Reduced list for the lognormal multilevel model (raw response this time).
data.list.reduced.logN <- list(
  N     = length(data$height),        # number of observations
  y     = data$height,                # response variable (raw height)
  age   = data$age.sc,                # scaled tree age
  nprov = length(unique(data$prov)),  # number of provenances
  prov  = as.numeric(data$prov)       # provenance index per tree
)
# Compile and fit the non-centered lognormal multilevel model.
mmLogNnc <- stan_model("mmLogNnc.stan")
fit.mmLogNnc <- sampling(mmLogNnc, data = data.list.reduced.logN, iter = 2000,
                         chains = 2, cores = 2,
                         control = list(max_treedepth = 14),
                         save_warmup = FALSE)
## Warning: There were 2 divergent transitions after warmup. Increasing adapt_delta above 0.8 may help. See
## http://mc-stan.org/misc/warnings.html#divergent-transitions-after-warmup
## Warning: Examine the pairs() plot to diagnose sampling problems
# Posterior medians, 95% intervals and convergence diagnostics (Rhat, ESS).
broom::tidyMCMC(fit.mmLogNnc,
                pars = c("beta_age", "beta_age2", "z_prov", "sigma_prov", "alpha",
                         "sigma_y", "lp__"),
                droppars = NULL, estimate.method = "median",
                ess = TRUE, rhat = TRUE, conf.int = TRUE) %>%
  kable(caption = "One-varying intercept model mmLogNnc with Lognormal non-centered parameterization")
| term | estimate | std.error | conf.low | conf.high | rhat | ess |
|---|---|---|---|---|---|---|
| beta_age | 1.0779235 | 1.0213755 | -0.9255132 | 3.084179 | 0.9997807 | 1714 |
| beta_age2 | 2.2395739 | 1.0179360 | 0.2869805 | 4.208102 | 0.9995171 | 1721 |
| z_prov[1] | 0.6995530 | 0.7315178 | -0.7943672 | 2.082789 | 0.9992234 | 1026 |
| z_prov[2] | -0.2425250 | 0.7762592 | -1.8652479 | 1.309054 | 1.0003687 | 1270 |
| z_prov[3] | -0.5271000 | 0.8231962 | -2.2318216 | 1.068830 | 1.0058908 | 1297 |
| z_prov[4] | 0.1726138 | 0.8260796 | -1.4574600 | 1.793651 | 1.0007149 | 1462 |
| z_prov[5] | 0.2978564 | 0.7677986 | -1.3186946 | 1.735952 | 1.0008190 | 1483 |
| sigma_prov | 0.0493763 | 0.0848143 | 0.0023718 | 0.246020 | 1.0027449 | 314 |
| alpha | 314.6862899 | 17.5512995 | 285.3169758 | 337.572033 | 1.0088476 | 263 |
| sigma_y | 0.5778931 | 0.0135961 | 0.5520952 | 0.605530 | 0.9990399 | 1557 |
| lp__ | 16.0071436 | 2.9141755 | 9.2460113 | 20.432407 | 0.9998380 | 399 |
# Posterior predictive check on the raw-height scale (lognormal likelihood).
ppc_dens_overlay(y = data$height,
                 yrep = as.matrix(fit.mmLogNnc, pars = "y_rep")[1:50, ]) +
  theme_bw() +
  theme(legend.text = element_text(size = 25),
        legend.title = element_text(size = 18),
        axis.text = element_text(size = 18),
        legend.position = c(0.8, 0.6))
# Trace plots of a standardized intercept and the group scale.
mcmc_trace(as.array(fit.mmLogNnc),
           pars = c("z_prov[3]", "sigma_prov"),
           np = nuts_params(fit.mmLogNnc)) +
  xlab("Post-warmup iteration")
# Pairs plot to locate the divergent transitions (highlighted points).
mcmc_pairs(as.array(fit.mmLogNnc),
           np = nuts_params(fit.mmLogNnc),
           pars = c("beta_age", "beta_age2", "alpha",
                    "sigma_prov", "z_prov[3]", "sigma_y"),
           off_diag_args = list(size = 1, alpha = 1 / 3),
           np_style = pairs_style_np(div_size = 3, div_shape = 19))